Required packages: ggthemes, fixest, broom, tidyverse
# Load required packages, installing pacman first if it is missing.
# requireNamespace() checks availability without attaching pacman to the
# search path (we only ever call it via pacman::), which is the
# recommended idiom over require() for dependency checks.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(
  ggthemes, fixest, broom, tidyverse
)
To see how DD works in practice we will first generate a fake dataset where we know the true value of what we want to estimate. This is a good skill to have in order to make sure you understand how methods work, and that your code does what you want it to do. If you know what the result is and your code produces something different, your code is wrong.
To generate fake data we need to model the true data generating process. We will make this exactly the DD model, but with some added noise.
Any time you introduce randomness into your code, you will want to set a random number generator seed so it is reproducible.
# Initialize RNG with a fixed seed so every run of this notebook draws
# the same "random" numbers and all results below are reproducible.
set.seed(123456)
Every time you run this notebook you will get the exact same results even though we are drawing “random” numbers.
The model generating our data will be very simple: 1. A time-invariant factor specific to each state (NY and PA) 2. A period-specific factor common across both NY and PA 3. A treatment effect in NY after the fake policy is implemented 4. Some added random noise in outcomes
First we need to define the time-invariant factors.
# Time-invariant state fixed effects: NY starts with more biodiversity
# than PA. Laid out one row per line so truth values are easy to scan.
state_fes <- tribble(
  ~state, ~state_fe,
  "NY",   88,
  "PA",   44
)
Here we used a tribble, which is just a tibble but where we can write it out element-by-element. This is helpful when you need to initialize small data frames.
Next we need the common, period-specific factors.
# Common period fixed effects shared by both states: biodiversity is
# declining over time (after < before).
period_fes <- tribble(
  ~period,  ~period_fe,
  "before", 341,
  "after",  207
)
Last, we need to define the size of the treatment effect.
# True effect of the policy: how many additional species are saved?
# This is the parameter the DD estimator should recover (up to noise).
treatment_effect <- 55
Now we have defined all the parameters required to generate our data. Next, we need to actually generate the data. This takes a few steps.
First, we need to generate N observations for each combination of before/after and NY/PA. We can do this with crossing.
# Build the full fake panel: every combination of state, period, and
# observation id -- 500 draws per state-period cell, 2,000 rows total.
bio_df <- crossing(
  state  = c("NY", "PA"),
  period = c("before", "after"),
  obs    = seq_len(500)
)
bio_df
## # A tibble: 2,000 x 3
## state period obs
## <chr> <chr> <int>
## 1 NY after 1
## 2 NY after 2
## 3 NY after 3
## 4 NY after 4
## 5 NY after 5
## 6 NY after 6
## 7 NY after 7
## 8 NY after 8
## 9 NY after 9
## 10 NY after 10
## # … with 1,990 more rows
Second, we need to bring in the time-invariant, and period-specific factors with join functions.
# Attach the period- and state-specific fixed effects to every row.
# With no `by` argument, inner_join() matches on the shared column
# names ("period", then "state"), as the messages below report.
bio_df <- bio_df %>%
inner_join(period_fes) %>%
inner_join(state_fes)
## Joining, by = "period"
## Joining, by = "state"
bio_df
## # A tibble: 2,000 x 5
## state period obs period_fe state_fe
## <chr> <chr> <int> <dbl> <dbl>
## 1 NY after 1 207 88
## 2 NY after 2 207 88
## 3 NY after 3 207 88
## 4 NY after 4 207 88
## 5 NY after 5 207 88
## 6 NY after 6 207 88
## 7 NY after 7 207 88
## 8 NY after 8 207 88
## 9 NY after 9 207 88
## 10 NY after 10 207 88
## # … with 1,990 more rows
Third, it will be helpful to define a variable for units that are treated: New York and after.
# Flag treated units (NY after the policy) and order the period factor
# so "before" is the reference level. factor(levels = c("before",
# "after")) already places "before" first, so the original follow-up
# relevel(period, "before") call was a no-op and has been removed.
bio_df <- bio_df %>%
  mutate(
    treated = state == "NY" & period == "after",
    period = factor(period, levels = c("before", "after"))
  )
bio_df
## # A tibble: 2,000 x 6
## state period obs period_fe state_fe treated
## <chr> <fct> <int> <dbl> <dbl> <lgl>
## 1 NY after 1 207 88 TRUE
## 2 NY after 2 207 88 TRUE
## 3 NY after 3 207 88 TRUE
## 4 NY after 4 207 88 TRUE
## 5 NY after 5 207 88 TRUE
## 6 NY after 6 207 88 TRUE
## 7 NY after 7 207 88 TRUE
## 8 NY after 8 207 88 TRUE
## 9 NY after 9 207 88 TRUE
## 10 NY after 10 207 88 TRUE
## # … with 1,990 more rows
Now we need to generate our data by adding the time-invariant effect state_fe, the period-specific effect period_fe, and some random noise given by rnorm. If the observation is New York after the policy we will also add treatment_effect.
# Generate the outcome from the DGP: state FE + period FE + N(0, 100^2)
# noise, plus treatment_effect for treated (NY x after) rows.
# NOTE(review): ifelse() evaluates BOTH branches, so two full-length
# rnorm(n()) vectors are drawn and half the draws are discarded. The
# code is kept byte-identical because consolidating it into a single
# rnorm() call would change the RNG stream and therefore every printed
# result below.
bio_df <- bio_df %>%
mutate(
biodiversity = ifelse(
treated,
period_fe + state_fe + treatment_effect + 100*rnorm(n()),
period_fe + state_fe + 100*rnorm(n())
)
)
bio_df
## # A tibble: 2,000 x 7
## state period obs period_fe state_fe treated biodiversity
## <chr> <fct> <int> <dbl> <dbl> <lgl> <dbl>
## 1 NY after 1 207 88 TRUE 433.
## 2 NY after 2 207 88 TRUE 322.
## 3 NY after 3 207 88 TRUE 314.
## 4 NY after 4 207 88 TRUE 359.
## 5 NY after 5 207 88 TRUE 575.
## 6 NY after 6 207 88 TRUE 433.
## 7 NY after 7 207 88 TRUE 481.
## 8 NY after 8 207 88 TRUE 600.
## 9 NY after 9 207 88 TRUE 467.
## 10 NY after 10 207 88 TRUE 307.
## # … with 1,990 more rows
Let’s plot the data. New York is in black, Pennsylvania is in orange.
# Ensure "before" precedes "after" on the x axis (uses <- rather than =
# for assignment; this is a no-op if the factor was already releveled).
bio_df$period <- factor(bio_df$period, levels = c("before", "after"))
# Jittered scatter of the raw fake data: NY in black, PA in orange.
# The original passed `group = interaction(state, period)` to ggplot()
# OUTSIDE aes(); arguments there are not aesthetic mappings and are
# ignored, so the dead argument has been removed.
ggplot(bio_df) +
geom_jitter(aes(x = period, y = biodiversity, color = interaction(state), shape = interaction(state)), size = 3) +
annotate("text", size = 8, label = "NY", x = 1, y = 700) +
annotate("text", size = 8, label = "PA", x = 1, y = 100, color = "orange") +
scale_color_colorblind() +
theme_minimal() +
scale_x_discrete(labels = c("Before Treatment", "After Treatment")) +
scale_y_continuous(limits = c(50, 700)) +
labs(
x = "Time",
y = "Biodiversity",
title = "The raw fake data"
) +
theme(
legend.position = "none",
title = element_text(size = 24),
axis.text.x = element_text(size = 24), axis.text.y = element_text(size = 24),
axis.title.x = element_text(size = 24), axis.title.y = element_text(size = 24),
panel.grid.minor.x = element_blank(), panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
axis.line = element_line(colour = "black")
)
## Warning: Removed 13 rows containing missing values (geom_point).
Next we can estimate the effect using DD.
# Difference-in-differences regression:
# outcome ~ treatment dummy + period fixed effect + state fixed effect.
# Fit and tidy as two named steps rather than one pipe.
dd_fit <- fixest::feols(
  biodiversity ~ treated + period + state,
  data = bio_df
)
estimates <- broom::tidy(dd_fit)
estimates
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 436. 4.43 98.3 0.
## 2 treatedTRUE 54.7 8.87 6.17 8.14e-10
## 3 periodafter -139. 6.27 -22.2 7.41e-98
## 4 statePA -48.0 6.27 -7.65 3.04e-14
Compare the true values (left) versus the estimates (right). DD correctly recovers the true treatment effect!
# True treatment effect (left) vs. DD estimate (right)
c(treatment_effect, estimates$estimate[2]) # treatment_effect
## [1] 55.00000 54.72843
# True period effect (after minus before) vs. estimated period coefficient
true_period <- period_fes$period_fe[period_fes$period == "after"] -
  period_fes$period_fe[period_fes$period == "before"]
c(true_period, estimates$estimate[3]) # period_after
## [1] -134.0000 -139.2645
# True state effect (PA minus NY) vs. estimated state coefficient
true_state <- state_fes$state_fe[state_fes$state == "PA"] -
  state_fes$state_fe[state_fes$state == "NY"]
c(true_state, estimates$estimate[4]) # statePA
## [1] -44.00000 -47.98205
Now let’s look at what happens if we take more naive approaches that do not address time-invariant factors, or common period-specific factors. We will look at how the failure to address these things biases our results in predictable ways.
Suppose we just regressed biodiversity on the existence of the conservation policy.
# Naive regression: outcome on the treatment dummy only -- no state or
# period fixed effects, so both confounders are left in the error term.
naive_fit <- fixest::feols(biodiversity ~ treated, data = bio_df)
broom::tidy(naive_fit)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 357. 3.11 115. 0
## 2 treatedTRUE -6.13 6.23 -0.983 0.326
We underestimate the true effect because we did not difference out the common decline in biodiversity over time. The estimated effect of the policy in NY is confounded by the common decline in biodiversity across both states.
What if we controlled for these common, period-specific factors?
# Outcome on treatment plus period fixed effects (still omitting the
# state fixed effect, so NY's higher baseline is not differenced out).
period_fit <- fixest::feols(biodiversity ~ treated + period, data = bio_df)
broom::tidy(period_fit)
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 412. 3.18 129. 0.
## 2 treatedTRUE 103. 6.36 16.2 3.06e- 55
## 3 periodafter -163. 5.51 -29.6 2.47e-160
Now we overestimate the true effect. We (still) did not difference out the fact that NY tends to have higher levels of biodiversity. The estimated treatment effect is biased upward because NY is just more biodiverse even without the policy.
Now let’s work through the HR data to see how they got their results. We will just start with the cleaned data instead of raw. Cleaning the data is a huge chunk of the actual work of doing research, but takes a lot of code/time.
First, load the data and look at it.
########################################
## Read in blood lead data
# County-by-race-by-year panel of child blood lead outcomes (HR 2021
# cleaned data); column types are inferred by read_csv and echoed below.
bll_df <- read_csv("data/hr_2021_bll.csv")
## Parsed with column specification:
## cols(
## state = col_double(),
## county = col_double(),
## percent_high_conditional_tested = col_double(),
## ihs_bll = col_double(),
## race = col_double(),
## year = col_double(),
## weight = col_double(),
## unemp_rate = col_double(),
## median_income = col_double(),
## percent_non_white = col_double(),
## lead_emitted = col_double(),
## ap = col_double(),
## state_county = col_double()
## )
########################################
## Look at data
# Print the tibble to inspect column names, types, and the first rows.
bll_df
## # A tibble: 22,887 x 13
## state county percent_high_co… ihs_bll race year weight unemp_rate median_income percent_non_whi… lead_emitted ap
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 0.752 0.695 0 2015 11.5 0.052 56580 0.213 4650 64263
## 2 1 1 0 0 0 2014 5.39 0.059 54366 0.207 4750. 0
## 3 1 1 0 0 0 2013 4.47 0.062 51868 0.205 4835 55932
## 4 1 1 0 0 0 2012 3.16 0.069 51441 0.203 4513. 0
## 5 1 1 0 0 0 2011 6 0.083 48863 0.200 4929. 0
## 6 1 1 0 0 0 2010 13.4 0.089 53049 0.198 5058. 0
## 7 1 1 1.14 0.979 0 2009 13.2 0.0970 53081 0.198 4827. 0
## 8 1 1 0.543 0.520 0 2008 13.6 0.051 51622 0.196 4455. 0
## 9 1 1 0.610 0.577 0 2007 12.8 0.033 50375 0.194 4672. 0
## 10 1 1 4.44 2.20 0 2006 6.71 0.033 46491 0.193 4951 76206
## # … with 22,877 more rows, and 1 more variable: state_county <dbl>
What are the variables?
state: state FIPS code; county: county FIPS code; state_county: full (state + county) FIPS code. First, let’s plot the raw data. This is something you should generally always do. It lets you see what is actually happening, whether you made data cleaning errors, etc. If what you’re trying to estimate pops up in the raw data that’s a good sign.
Here we will be plotting the raw data with two changes: 1. Only plot for 2005-2009. After 2009 some counties stop reporting lead poisoning, so the composition of the sample changes, making the comparison tricky. 2. Plot the weighted mean lead poisoning rate where we weight by the number of children tested. The idea here is that counties with more testing report estimated rates that are less noisy measures of the true rate. We want these counties to count more toward our nationwide average.
# Weighted mean lead-poisoning rate by group (race) and year, restricted
# to the 2005-2009 balanced panel; weights are the number of children
# tested so better-measured counties count more. Filtering BEFORE the
# group/summarise avoids aggregating years we immediately discard;
# na.rm = TRUE is spelled out (T is reassignable and discouraged); and
# .groups = "drop" returns an ungrouped result explicitly.
bll_df %>%
  filter(year >= 2005 & year <= 2009) %>%
  group_by(race, year) %>%
  summarise(
    percent_high_conditional_tested =
      weighted.mean(percent_high_conditional_tested, weight, na.rm = TRUE),
    .groups = "drop"
  ) %>%
ggplot(aes(x = year, y = percent_high_conditional_tested)) +
geom_vline(xintercept = 2006.5, color = "grey55", linetype = "dashed") +
geom_point(aes(shape = as.factor(race), color = as.factor(race)), size = 5) +
annotate("text", size = 4, label = "Treated", x = 2005.25, y = 2.85) +
annotate("text", size = 4, label = "Border", x = 2005.25, y = 2.5) +
annotate("text", size = 4, label = "Control", x = 2005.25, y = 1.4) +
annotate("text", size = 6, label = "Leaded Fuel", x = 2005.50, y = 0.25) +
annotate("text", size = 6, label = "Unleaded Fuel", x = 2008, y = 0.25) +
theme_minimal() +
scale_color_colorblind() +
labs(
x = "Year",
y = "Percent Lead Poisoned",
title = "Average lead poisoning rates by type (balanced panel)"
) +
theme(
legend.position = "none",
title = element_text(size = 14),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14),
panel.grid.minor.x = element_blank(), panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
axis.line = element_line(colour = "black")
)
########################################
## Difference-in-differences estimate: IHS blood lead on race x post-2007,
## with state-specific linear trends, time-varying covariates, and county
## plus year fixed effects; weighted, with SEs clustered by state.
bll_dd <- fixest::feols(
  ihs_bll ~ as.factor(race)*I(year > 2007) +
    as.factor(state)*year +
    unemp_rate + median_income + percent_non_white + lead_emitted + ap | state_county + year,
  weights = ~weight,
  data = bll_df
)
broom::tidy(bll_dd, cluster = "state")
## Variables 'I(year > 2007)TRUE', 'as.factor(state)4' and 37 others have been removed because of collinearity (see $collin.var).
## # A tibble: 44 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 as.factor(race)1 1.27e-2 0.0605 0.210 0.834
## 2 as.factor(race)2 1.08e-1 0.115 0.941 0.347
## 3 unemp_rate 1.45e-1 0.402 0.360 0.719
## 4 median_income 1.59e-6 0.00000252 0.633 0.527
## 5 percent_non_white 1.69e+0 0.887 1.90 0.0571
## 6 lead_emitted 8.45e-9 0.00000000307 2.75 0.00596
## 7 ap 1.01e-7 0.0000000542 1.86 0.0630
## 8 as.factor(race)1:I(year > 2007)TRUE -3.31e-2 0.0439 -0.754 0.451
## 9 as.factor(race)2:I(year > 2007)TRUE -1.18e-1 0.0587 -2.01 0.0440
## 10 as.factor(state)4:year 5.88e-3 0.00307 1.92 0.0551
## # … with 34 more rows
########################################
## Event study
# Convert year to an unordered factor and set 2007 (the omitted
# reference year) BY NAME. The original relevel(bll_df$year, 3) relied
# on 2007 happening to be the third level; ref = "2007" states the
# intent and is robust to the year range changing.
bll_df$year <- factor(bll_df$year, ordered = FALSE)
bll_df$year <- relevel(bll_df$year, ref = "2007")
# Estimate event study: the race-by-year interactions are the
# coefficients of interest. 2007 is absorbed as the reference year, so a
# zero row is inserted in its slot (.after = 2 -> third position, 2007)
# before building the plotting-year variable. broom::tidy is namespaced
# for consistency with the rest of the file, and the regex escapes the
# literal dot in "as.factor".
event_study <- fixest::feols(
  ihs_bll ~ as.factor(race)*year +
    as.factor(state)*year +
    unemp_rate + median_income + percent_non_white + lead_emitted + ap | state_county + year,
  weights = ~weight,
  data = bll_df) %>%
  broom::tidy(cluster = "state") %>%
  filter(str_detect(term, "^as\\.factor\\(race\\)2:year")) %>%
  add_row(estimate = 0, std.error = 0, .after = 2) %>%
  mutate(year = row_number() + 2004)
## Variables 'year2005', 'year2006' and 98 others have been removed because of collinearity (see $collin.var).
# Plot event study: point estimates with a 95% confidence ribbon
# (estimate +/- 1.96 * std.error), all relative to the omitted year 2007.
# The dashed vertical line marks the leaded-to-unleaded fuel transition.
ggplot(data = event_study, aes(x = year, y = estimate)) +
geom_hline(yintercept = 0, size = 0.5) +
geom_vline(xintercept = 2006.5, color = "grey55", linetype = "dashed") +
geom_ribbon(aes(ymin = estimate - 1.96*std.error, ymax = estimate + 1.96*std.error),
fill = "darkslateblue", alpha = 0.4) +
geom_line(size = 1) +
geom_point(size = 5) +
annotate("text", size = 4, label = "Leaded Fuel", x = 2005.50, y = -0.26) +
annotate("text", size = 4, label = "Unleaded Fuel", x = 2012, y = -0.26) +
theme_minimal() +
scale_color_colorblind() +
scale_x_continuous(breaks = seq(2005, 2015, 2)) +
labs(
x = "Year",
y = "Percent Lead Poisoned",
title = "Percent lead poisoned relative to 2007 (first unleaded year)"
) +
theme(
legend.position = "none",
title = element_text(size = 14),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14),
panel.grid.minor.x = element_blank(), panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
axis.line = element_line(colour = "black")
)
########################################
## Read in mortality data
# County-by-race-by-year panel of age-adjusted mortality rates; column
# types are inferred by read_csv and echoed below.
mortality_df <- read_csv("data/hr_2021_mortality.csv")
## Parsed with column specification:
## cols(
## state = col_double(),
## county = col_double(),
## age_adjusted_rate = col_double(),
## race = col_double(),
## year = col_double(),
## unemp_rate = col_double(),
## median_income = col_double(),
## percent_non_white = col_double(),
## lead_emitted = col_double(),
## ap = col_double(),
## pop_total = col_double(),
## weight = col_double(),
## state_county = col_double()
## )
# Print the tibble to inspect column names, types, and the first rows.
mortality_df
## # A tibble: 58,107 x 13
## state county age_adjusted_ra… race year unemp_rate median_income
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 4856. 0 2017 NA NA
## 2 1 1 4967. 0 2016 0.051 54487
## 3 1 1 5118. 0 2015 0.052 56580
## 4 1 1 4651. 0 2014 0.059 54366
## 5 1 1 5460 0 2013 0.062 51868
## 6 1 1 5863. 0 2012 0.069 51441
## 7 1 1 4920. 0 2011 0.083 48863
## 8 1 1 5154. 0 2010 0.089 53049
## 9 1 1 4932. 0 2009 0.0970 53081
## 10 1 1 5705. 0 2008 0.051 51622
## # … with 58,097 more rows, and 6 more variables: percent_non_white <dbl>,
## # lead_emitted <dbl>, ap <dbl>, pop_total <dbl>, weight <dbl>,
## # state_county <dbl>
# Convert year to an unordered factor and set 2007 (the omitted
# reference year) BY NAME. The original relevel(mortality_df$year, 9)
# relied on 2007 happening to be the ninth level (panel starting 1999);
# ref = "2007" states the intent and is robust to the year range.
mortality_df$year <- factor(mortality_df$year, ordered = FALSE)
mortality_df$year <- relevel(mortality_df$year, ref = "2007")
# Estimate event study on age-adjusted mortality; 2007 is absorbed as
# the reference year, so a zero row is inserted in its slot
# (.after = 8 -> ninth position, 2007) before building the plotting-year
# variable. The regex escapes the literal dot in "as.factor".
event_study <- fixest::feols(
  age_adjusted_rate ~ as.factor(race)*year +
    as.factor(state)*year +
    unemp_rate + median_income + percent_non_white + lead_emitted + ap | state_county + year,
  weights = ~weight,
  data = mortality_df) %>%
  broom::tidy(cluster = "state") %>%
  filter(str_detect(term, "^as\\.factor\\(race\\)2:year")) %>%
  add_row(estimate = 0, std.error = 0, .after = 8) %>%
  mutate(year = row_number() + 1998)
## NOTES: 1,870 observations removed because of 0-weight.
## 4,047 observations removed because of NA values (Breakup: LHS: 2,212, RHS: 1,885, Fixed-effects: 0, Weights: 0).
## Variables 'year1999', 'year2000' and 65 others have been removed because of collinearity (see $collin.var).
# Plot event study: mortality point estimates with a 95% confidence
# ribbon (estimate +/- 1.96 * std.error), relative to the omitted year
# 2007. The dashed vertical line marks the fuel transition.
ggplot(data = event_study, aes(x = year, y = estimate)) +
geom_hline(yintercept = 0, size = 0.5) +
geom_vline(xintercept = 2006.5, color = "grey55", linetype = "dashed") +
geom_ribbon(aes(ymin = estimate - 1.96*std.error, ymax = estimate + 1.96*std.error),
fill = "darkslateblue", alpha = 0.4) +
geom_line(size = 1) +
geom_point(size = 5) +
annotate("text", size = 4, label = "Leaded Fuel", x = 2002, y = -200) +
annotate("text", size = 4, label = "Unleaded Fuel", x = 2012, y = -200) +
theme_minimal() +
scale_color_colorblind() +
scale_x_continuous(breaks = seq(1999, 2015, 2)) +
labs(
x = "Year",
y = "Age-Adjusted Mortality Rate (deaths per 100,000)",
title = "Mortality rate relative to 2007 (first unleaded year)"
) +
theme(
legend.position = "none",
title = element_text(size = 14),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14),
panel.grid.minor.x = element_blank(), panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
axis.line = element_line(colour = "black")
)